Lab 03 - Distributional and relational visualization¶

Used libraries¶

  1. pandas
    pip install pandas

  2. plotly
    pip install plotly

  3. pycountry - for getting country codes for geoPlot
    pip install pycountry

  4. geopandas - for working with geo plots
    pip install geopandas

In [ ]:
# imports
import pandas as pd 

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import numpy as np
import os
import pycountry
# import geopandas as gpd

import plotly.io as pio
pio.renderers.default = "notebook+pdf"

The university dataset has many .csv files.

For this experiment we will be exploring only the cwurData.csv and timesData.csv

In [ ]:
def loadCSVData(path):
    '''
        Function helps load the given path name if it is a CSV file into a DataFrame using pandas
    '''
    if os.path.splitext(path)[1] == ".csv": # read only csv files from the dataset
        df = pd.read_csv(path, delimiter=',')

        return df
    return None

Dataset timesData.csv for other tasks¶

In [ ]:
# loading timesData.csv
timesUniData = loadCSVData("../world_university_ranking/timesData.csv")

# info on columns
print(timesUniData.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2603 entries, 0 to 2602
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              2603 non-null   object 
 1   university_name         2603 non-null   object 
 2   country                 2603 non-null   object 
 3   teaching                2603 non-null   float64
 4   international           2603 non-null   object 
 5   research                2603 non-null   float64
 6   citations               2603 non-null   float64
 7   income                  2603 non-null   object 
 8   total_score             2603 non-null   object 
 9   num_students            2544 non-null   object 
 10  student_staff_ratio     2544 non-null   float64
 11  international_students  2536 non-null   object 
 12  female_male_ratio       2370 non-null   object 
 13  year                    2603 non-null   int64  
dtypes: float64(4), int64(1), object(9)
memory usage: 284.8+ KB
None
In [ ]:
timesUniData.isnull().sum()
Out[ ]:
world_rank                  0
university_name             0
country                     0
teaching                    0
international               0
research                    0
citations                   0
income                      0
total_score                 0
num_students               59
student_staff_ratio        59
international_students     67
female_male_ratio         233
year                        0
dtype: int64
In [ ]:
before = timesUniData.shape[0]
timesUniData = timesUniData.dropna()

print(f"Dropped {before - timesUniData.shape[0]} NaN values")
print(timesUniData.isnull().sum())
Dropped 241 NaN values
world_rank                0
university_name           0
country                   0
teaching                  0
international             0
research                  0
citations                 0
income                    0
total_score               0
num_students              0
student_staff_ratio       0
international_students    0
female_male_ratio         0
year                      0
dtype: int64
In [ ]:
def parseNumberOfStudents(df : pd.Series) -> list[float]:
    if df.dtype == object:
        tmp = [item.replace(',', '') for ind, item in df.iteritems() ]
        return list(map(float, tmp))
    else:
        return list(df)

def parseGenderRatio(df : pd.Series) -> list[float]:
    lst = []
    ratio = 0

    if df.dtype == object:
        for ind, item in df.iteritems():
            try:
                tmp = list(map(int, item.split(":") ) )
                lst.append(tmp[0]) 
            except ZeroDivisionError:
                lst.append(100)
            except:
                lst.append(0) # no data

        return lst
    else:
        return list(df)

# international student is given in the format 27% so we can just remove the % to
# convert it into a numeric data type
def parseInternationalStudents(df : pd.Series) -> list[float]:
    if df.dtype == object:
        tmp = [item.replace('%', '') for ind, item in df.iteritems() ]
        return list(map(float, tmp))
    else:
        return list(df)
In [ ]:
# here since number of students and gender ratio is given in strings we need to
# convert them into numeric types

num_stud = parseNumberOfStudents(timesUniData.num_students)
timesUniData.num_students = num_stud
pd.to_numeric(timesUniData.num_students)

gender_ratio = parseGenderRatio(timesUniData.female_male_ratio)
timesUniData.female_male_ratio = gender_ratio
pd.to_numeric(timesUniData.female_male_ratio)

international_students = parseInternationalStudents(
    timesUniData.international_students)
timesUniData.international_students = international_students
pd.to_numeric(timesUniData.international_students)

print(timesUniData.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2362 entries, 1 to 2602
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              2362 non-null   object 
 1   university_name         2362 non-null   object 
 2   country                 2362 non-null   object 
 3   teaching                2362 non-null   float64
 4   international           2362 non-null   object 
 5   research                2362 non-null   float64
 6   citations               2362 non-null   float64
 7   income                  2362 non-null   object 
 8   total_score             2362 non-null   object 
 9   num_students            2362 non-null   float64
 10  student_staff_ratio     2362 non-null   float64
 11  international_students  2362 non-null   float64
 12  female_male_ratio       2362 non-null   int64  
 13  year                    2362 non-null   int64  
dtypes: float64(6), int64(2), object(6)
memory usage: 276.8+ KB
None
In [ ]:
# plot 1 
# Student staff ratio in the year 2015-2016
year = [2015, 2016]

uniData = timesUniData.query(f"year in {year}")
uni2015 = timesUniData[timesUniData["year"] == 2015]
uni2016 = timesUniData[timesUniData["year"] == 2016]

hist1 = go.Histogram(x = uni2015.student_staff_ratio, name='2015',
        marker = dict(color = 'rgba(119, 157, 230, 0.8)' ) )

hist2 = go.Histogram(x = uni2016.student_staff_ratio, name = '2016', 
        marker = dict(color = 'rgba(220, 112, 92, 0.87)' ) )


layout = dict(title = 'Student staff ratio in year 2015-2016', title_x = 0.5, barmode='overlay',
	xaxis_title = 'Count', yaxis_title = 'Student-staff ratio')

fig = go.Figure(data = [hist1, hist2], layout = layout)

fig.update_layout(title = 'Student staff ratio in year 2015-2016', title_x = 0.3, barmode='overlay',
xaxis_title = 'Count', yaxis_title = 'Student-staff ratio', font_size = 16)
fig.update_traces(opacity = .5)

# fig.show(renderer='browser', auto_open=True)
fig.show()
In [ ]:
# plot 2
# compare the teaching with world ranking for the top 50 universities
year = 2016
num_uni = 30

topUnis = timesUniData[timesUniData.year == year].iloc[: num_uni]

colors = [float(item) for item in topUnis.international]
num_students = topUnis.num_students

data = go.Scatter(x = topUnis.world_rank, y = topUnis.teaching,
	mode = 'markers+text',
	marker = dict(
			color = colors,
			size = num_students,
			sizeref= (5.0 * max(num_students) ) / (25.**2),
			showscale = True
	),
	text = topUnis.university_name, 
	textfont=dict(
        family="sans serif",
        size=12,
		color='#2e2e2d'
    ))

layout = dict(xaxis_title = 'World ranking', yaxis_title='Teaching score', legend_title ='International score',
        title = f"Worlds top {num_uni} univeristy with number of students(size) and international score(scale) - {year}",
		font = dict(
			family="Calibri",
			size=16,
			color="RebeccaPurple"
		))

fig = go.Figure(data = data, layout = layout)
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

# fig.show(renderer='browser', auto_open=True)
fig.show()
In [ ]:
# for geo plots we need the iso alpha country
# so we need to get country codes
jsonUrl = 'https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.json'

countyDf = pd.read_json(jsonUrl)

countryCodes = dict(zip(countyDf.name, countyDf["alpha-3"]))

# since some countries dont have regional codes added like
# UK, South Korea, Hong Kong, Taiwan, Czech Republic, Iran etc.
# we add them manually
countryCodes['United Kingdom'] = 'GBR'
countryCodes['Hong Kong'] = 'HKG'
countryCodes["South Korea"] = 'KOR'
countryCodes['Republic of Ireland'] = 'IRL'
countryCodes['Taiwan'] = 'TWN'
countryCodes['Czech Republic'] = 'CZE'
countryCodes['Iran'] = 'IRN'
countryCodes['Macau'] = 'MAC'

# print(countryCodes["United Kingdom"])
In [ ]:
# plot 3
# geo plot of the distribution of the universities in the world for the year 2015
year = 2015

def getUniCount(df : pd.DataFrame) -> pd.DataFrame:
    countryCount = {}

    for ind, row in df.iterrows():
        cname = row["country"]

        if cname in countryCount:
            countryCount[cname] += 1
        else:
            countryCount[cname] = 1

    countDf = pd.DataFrame(columns=['Country', 'Number of university', 'alpha3_code']) # create a dataframe

    for key, value in countryCount.items():
        appendVals = [key, value, countryCodes[key]]
        countDf.loc[len(countDf)] = appendVals # append the values to the dataframe

    countDf["Number of university"] = pd.to_numeric(countDf["Number of university"])

    return countDf.sort_values(by=["Number of university"], ascending=False) # sort by number of university in descending order

uniData = timesUniData[timesUniData["year"] == year]
uniCountData = getUniCount(uniData)


# fig = px.scatter_geo(uniCountData, locations="alpha3_code", color="Country", hover_name="Country",
#         projection="natural earth", size="Number of university" )

fig = px.choropleth(uniCountData, locations="alpha3_code", color="Number of university", hover_name="Country",
        color_continuous_scale="viridis")

fig.update_layout(title = f"Geo plot of the distribution of the universities in the world for the year {year}",
    title_x = .5, title_font_size = 20)

# fig.show(renderer='browser', auto_open=True)
# fig.write_image('./geo_plot.svg')
fig.show()
In [ ]:
# plot 4
# world rank vs citations of the top 100 unviersity of year 2013, 2014, 2015

numUni = 100

# since world rankings are not given in pure int64 format we use a cheaty way of solving
# by generating rankings list for 1 to numUni

rankingList = [i for i in range(1, numUni + 1) ]

uni2013 = timesUniData[timesUniData["year"] == 2013].iloc[:numUni,]
uni2014 = timesUniData[timesUniData["year"] == 2014].iloc[:numUni,]
uni2015 = timesUniData[timesUniData["year"] == 2015].iloc[:numUni,]

print(uni2013)

scatter2013 = go.Scatter(x = rankingList, y = uni2013.citations, name = '2014', 
        text = uni2013.university_name, mode = 'markers', marker = dict(color = 'rgba(119, 157, 230, 0.8)' ) )

scatter2014 = go.Scatter(x = rankingList, y = uni2014.citations, name = '2014', 
        text = uni2014.university_name, mode = 'markers', marker = dict(color = 'rgba(82, 84, 80, .8)' ) )

scatter2015 = go.Scatter(x = rankingList, y = uni2015.citations, name = '2015',
        text = uni2015.university_name, mode = 'markers', marker = dict(color = 'rgba(220, 112, 92, 0.87)' ) )
    

data = [scatter2013, scatter2014, scatter2015]

layout = dict(title = 'Citations vs world ranking for the top 100 univeristies of year 2013, 2014 and 2015',
             legend_title = 'Year', title_x = 0.5, title_font_size = 20,
             xaxis = dict(title = 'World ranking', ticklen = 20),
             yaxis = dict(title = "Citations", ticklen = 20))

fig = go.Figure(data = data, layout = layout)
# fig.show(renderer='browser', auto_open=True)
fig.show()
    world_rank                        university_name  \
602          1     California Institute of Technology   
603          2                    Stanford University   
604          2                   University of Oxford   
606          5  Massachusetts Institute of Technology   
607          6                   Princeton University   
..         ...                                    ...   
712        110                University of Sheffield   
713        110                   University of Sussex   
714        113                University of Cape Town   
715        114     Eindhoven University of Technology   
716        115                  Maastricht University   

                      country  teaching international  research  citations  \
602  United States of America      96.3          59.8      99.4       99.7   
603  United States of America      95.0          56.6      98.8       99.3   
604            United Kingdom      89.7          88.7      98.1       95.6   
606  United States of America      92.9          81.6      89.2       99.9   
607  United States of America      89.5          54.5      99.4       99.8   
..                        ...       ...           ...       ...        ...   
712            United Kingdom      49.5          68.1      46.2       71.0   
713            United Kingdom      34.9          76.7      39.5       91.1   
714              South Africa      34.7          75.1      45.5       79.7   
715               Netherlands      44.1          69.8      51.8       63.8   
716               Netherlands      39.7          85.2      53.2       62.5   

    income total_score  num_students  student_staff_ratio  \
602   95.6        95.5        2243.0                  6.9   
603   62.4        93.7       15596.0                  7.8   
604   79.8        93.7       19919.0                 11.6   
606   92.9        93.1       11074.0                  9.0   
607   79.5        92.7        7929.0                  8.4   
..     ...         ...           ...                  ...   
712   41.5        56.2       23311.0                 15.5   
713   32.1        56.2       12001.0                 17.4   
714   87.3        55.8       20040.0                 12.1   
715  100.0        55.6        8176.0                 16.0   
716   99.1        55.5       15626.0                 18.9   

     international_students  female_male_ratio  year  
602                    27.0                 33  2013  
603                    22.0                 42  2013  
604                    34.0                 46  2013  
606                    33.0                 37  2013  
607                    27.0                 45  2013  
..                      ...                ...   ...  
712                    31.0                 50  2013  
713                    35.0                 54  2013  
714                    18.0                 53  2013  
715                    14.0                 19  2013  
716                    48.0                 56  2013  

[100 rows x 14 columns]